home *** CD-ROM | disk | FTP | other *** search
- /* ARC - Archive utility - ARCLZW
-
- $define(tag,$$segment(@1,$$index(@1,=)+1))#
- $define(version,Version $tag(
- TED_VERSION DB =1.39), created on $tag(
- TED_DATE DB =07/02/85) at $tag(
- TED_TIME DB =10:36:00))#
- $undefine(tag)#
- $version
-
- (C) COPYRIGHT 1985 by System Enhancement Associates; ALL RIGHTS RESERVED
-
- By: Thom Henderson
-
- Description:
- This file contains the routines used to implement Lempel-Zev
- data compression, which calls for building a coding table on
- the fly. This form of compression is especially good for encoding
- files which contain repeated strings, and can often give dramatic
- improvements over traditional Huffman SQueezing.
-
- Language:
- Computer Innovations Optimizing C86
-
- Programming notes:
- In this section I am drawing heavily on the LZWCOM and LZWUNC
- programs by Kent Williams of Cedar Rapids Iowa. This section
- is mainly his code, and I don't pretend to understand everything
- that's going on, so please refer any technical questions to him.
-
- As best as I can tell, this method works by tracing down a hash
- table of code strings where each entry has the property:
-
- if <string> <char> is in the table
- then <string> is in the table.
- */
- #include <stdio.h>
-
- long lzwlen; /* length after crunching */
-
- #define FALSE 0
- #define TRUE !FALSE
- #define TABSIZE 4096
- #define NO_PRED 0xFFFF
- #define EMPTY 0xFFFF
- #define NOT_FND 0xFFFF
-
- static unsigned int inbuf; /* partial input code storage */
- static unsigned int outbuf; /* partial output code storage */
-
- static unsigned char stack[TABSIZE]; /* local push and pop stack */
- static int sp; /* current stack pointer */
-
- static struct entry /* string table entry format */
- { char used; /* true when this entry is in use */
- unsigned int next; /* ptr to next in collision list */
- unsigned int predecessor; /* code for preceeding string */
- unsigned char follower; /* char following string */
- } string_tab[TABSIZE]; /* the code string table */
-
- /* The h() routine is used to calculate a hash value, using the
- "mid-square" algorithm. That is, a primitive hash value is
- calculated by taking the middle twelve bits of the square of a key.
- */
-
- static unsigned h(pred,foll) /* calculate basic hash value */
- unsigned int pred; /* code for preceeding string */
- unsigned char foll; /* value of following char */
- {
- long local; /* local hash value */
-
- local = (pred + foll) | 0x0800; /* create the hash key */
- local *= local; /* square it */
- return (local >> 6) & 0x0FFF; /* return the middle 12 bits */
- }
-
- /* The eolist() function is used to trace down a list of entries with
- duplicate keys until the last duplicate is found.
- */
-
- static unsigned eolist(index) /* find last duplicate */
- unsigned int index;
- {
- int temp;
-
- while(temp=string_tab[index].next) /* while more duplicates */
- index = temp;
-
- return index;
- }
-
- /* The hash() routine is used to find a spot in the hash table for a new
- entry. It performs a "hash and linear probe" lookup, using h() to
- calculate the starting hash value and eolist() to perform the linear
- probe. This routine DOES NOT detect a table full condition. That
- MUST be checked for elsewhere.
- */
-
- static unsigned hash(pred,foll) /* find spot in the string table */
- unsigned int pred; /* code for preceeding string */
- unsigned char foll; /* char following string */
- {
- unsigned int local, tempnext; /* scratch storage */
- struct entry *ep; /* allows faster table handling */
-
- local = h(pred,foll); /* get initial hash value */
-
- if(!string_tab[local].used) /* if that spot is free */
- return local; /* then that's all we need */
-
- else /* else a collision has occured */
- { local = eolist(local); /* move to last duplicate */
-
- /* We must find an empty spot. We start looking 101 places
- down the table from the last duplicate.
- */
-
- tempnext = (local+101) & 0x0FFF;
- ep = &string_tab[tempnext]; /* initialize pointer */
-
- while(ep->used) /* while empty spot not found */
- { if(++tempnext==TABSIZE) /* if we are at the end */
- { tempnext = 0; /* wrap to beginning of table*/
- ep = string_tab;
- }
- else ++ep; /* point to next element in table */
- }
-
- /* local still has the pointer to the last duplicate, while
- tempnext has the pointer to the spot we found. We use
- this to maintain the chain of pointers to duplicates.
- */
-
- string_tab[local].next = tempnext;
-
- return tempnext;
- }
- }
-
- /* The unhash() function is used to search the hash table for a given key.
- Like hash(), it performs a hash and linear probe search. It returns
- either the number of the entry (if found) or NOT_FND (if not found).
- */
-
- static unsigned unhash(pred,foll) /* search string table for a key */
- unsigned int pred; /* code of preceeding string */
- unsigned char foll; /* character following string */
- {
- unsigned int local, offset; /* scratch storage */
- struct entry *ep; /* this speeds up access */
-
- local = h(pred,foll); /* initial hash */
-
- while(1)
- { ep = &string_tab[local]; /* speed up table access */
-
- if((ep->predecessor==pred) && (ep->follower==foll))
- return local; /* we have a match */
-
- if(!ep->next) /* if no more duplicates */
- return NOT_FND; /* then key is not listed */
-
- local = ep->next; /* move on to next duplicate */
- }
- }
-
- /* The init_tab() routine is used to initialize our hash table.
- You realize, of course, that "initialize" is a complete misnomer.
- */
-
- static init_tab() /* set ground state in hash table */
- {
- unsigned int i; /* table index */
-
- setmem((char *)string_tab,sizeof(string_tab),0);
-
- for(i=0; i<256; i++) /* list all single byte strings */
- upd_tab(NO_PRED,i);
-
- inbuf = outbuf = EMPTY; /* nothing is in our buffers */
- }
-
- /* The upd_tab routine is used to add a new entry to the string table.
- As previously stated, no checks are made to ensure that the table
- has any room. This must be done elsewhere.
- */
-
- upd_tab(pred,foll) /* add an entry to the table */
- unsigned int pred; /* code for preceeding string */
- unsigned int foll; /* character which follows string */
- {
- struct entry *ep; /* pointer to current entry */
-
- /* calculate offset just once */
-
- ep = &string_tab[hash(pred,foll)];
-
- ep->used = TRUE; /* this spot is now in use */
- ep->next = 0; /* no duplicates after this yet */
- ep->predecessor = pred; /* note code of preceeding string */
- ep->follower = foll; /* note char after string */
- }
-
- /* This algorithm encodes a file into twelve bit strings (three nybbles).
- The getcode() and putcode() routines are used to output these strings
- a byte (or two) at a time.
- */
-
- static getcode(fd) /* read in a twelve bit code */
- FILE *fd; /* file to get code from */
- {
- unsigned int localbuf, returnval;
-
- if(inbuf==EMPTY) /* if on a code boundary */
- { if((localbuf=getc_unp(fd))==EOF) /* get start of next code */
- return EOF; /* pass back end of file status */
- localbuf &= 0xFF; /* mask down to true byte value */
- if((inbuf=getc_unp(fd))==EOF) /* get end of code, start of next */
- return EOF; /* this should never happen */
- inbuf &= 0xFF; /* mask down to true byte value */
-
- returnval = ((localbuf<<4)&0xFF0) + ((inbuf>>4)&0x00F);
- inbuf &= 0x000F; /* leave partial code pending */
- }
-
- else /* buffer contains first nybble */
- { if((localbuf=getc_unp(fd))==EOF)
- return EOF;
- localbuf &= 0xFF;
-
- returnval = localbuf + ((inbuf<<8)&0xF00);
- inbuf = EMPTY; /* note no hanging nybbles */
- }
- return returnval; /* pass back assembled code */
- }
-
- static putcode(fd,code) /* output a twelve bit code */
- FILE *fd; /* file to send code to */
- unsigned int code; /* code to send */
- {
- int localbuf;
-
- if(outbuf==EMPTY) /* if no hanging nybbles */
- { putch(((code>>4)&0xFF),fd); /* then output first two nybbles */
- outbuf = code & 0x000F; /* and save third for later */
- }
- else /* else dump pending nybble */
- { putch((((outbuf<<4)&0xFF0) + ((code>>8)&0x00F)),fd);
- putch((code&0x00FF),fd); /* last two nybbles can go also */
- outbuf = EMPTY; /* and no nybbles left hanging */
- }
- }
-
- /* The putch() routine is used to write bytes to the output file,
- as certain checks must be performed.
- */
-
- static putch(c,fp) /* output a byte value */
- char c; /* the byte value to put */
- FILE *fp; /* the file to output to */
- {
- lzwlen++; /* bump our length counter */
-
- if(fp) /* if output file is real */
- if(fputc(c,fp)==EOF) /* then make sure this works */
- abort("Write fail (disk full?)");
- }
-
- /* The flushout() routine must be called to flush out any partial code
- which gets left behind.
- */
-
- static flushout(fd) /* flush out the final code */
- FILE *fd; /* file to send code to */
- {
- if(outbuf!=EMPTY) /* dump any hanging nybble */
- putch(((outbuf<<4)&0xFF0),fd);
- }
-
- static push(c) /* push char onto stack */
- int c; /* character to push */
- {
- stack[sp] = ((char) c); /* coerce integer into a char */
-
- if(++sp >= TABSIZE)
- abort("Stack overflow\n");
- }
-
- static int pop() /* pop character from stack */
- {
- if(sp>0)
- return ((int) stack[--sp]); /* leave ptr at next empty slot */
-
- else return EMPTY;
- }
-
- /***** LEMPEL-ZEV DECOMPRESSION *****/
-
- static int code_count; /* needed to detect table full */
- static unsigned code; /* where we are so far */
- static int firstc; /* true only on first character */
-
- init_ucr() /* get set for uncrunching */
- {
- sp = 0; /* clear out the stack */
- init_tab(); /* set up atomic code definitions */
- code_count = TABSIZE - 256; /* note space left in table */
- firstc = 1; /* true only on first code */
- }
-
- int getc_ucr(f) /* get next uncrunched byte */
- FILE *f; /* file containing crunched data */
- {
- unsigned int c; /* a character of input */
- int code, newcode;
- static int oldcode, finchar;
- struct entry *ep; /* allows faster table handling */
-
- if(firstc) /* first code is always known */
- { firstc = FALSE; /* but next will not be first */
- oldcode = getcode(f);
- return finchar = string_tab[oldcode].follower;
- }
-
- if(!sp) /* if stack is empty */
- { if((code=newcode=getcode(f))==EOF)
- return EOF;
-
- ep = &string_tab[code]; /* initialize pointer */
-
- if(!ep->used) /* if code isn't known */
- { code = oldcode;
- ep = &string_tab[code]; /* re-initialize pointer */
- push(finchar);
- }
-
- while(ep->predecessor!=NO_PRED)
- { push(ep->follower); /* decode string backwards */
- code = ep->predecessor;
- ep = &string_tab[code];
- }
-
- push(finchar=ep->follower); /* save first character also */
-
- /* The above loop will terminate, one way or another,
- with string_tab[code].follower equal to the first
- character in the string.
- */
-
- if(code_count) /* if room left in string table */
- { upd_tab(oldcode,finchar);
- --code_count;
- }
-
- oldcode = newcode;
- }
-
- return pop(); /* return saved character */
- }
-
-
- /***** LEMPEL-ZEV COMPRESSION *****/
-
- init_cr() /* initialize for Lempel-Zev pack */
- {
- init_tab(); /* set string table ground state */
- code_count = TABSIZE - 256; /* set initial table space */
- firstc = 1; /* set up special case */
- lzwlen = 0; /* reset length counter */
- }
-
- putc_cr(c,t) /* output a byte crunched */
- char c; /* character to crunch */
- FILE *t; /* file to write to */
- {
- unsigned localcode;
-
- if(firstc) /* special case for first char */
- { code = unhash(NO_PRED,c); /* set initial code for table */
- firstc = 0; /* no longer first char */
- return;
- }
-
- if((localcode=unhash(code,c))!=NOT_FND)
- { code = localcode;
- return;
- }
-
- /* When the above clause comes false, we have found the code for
- the longest known code string.
- */
-
- putcode(t,code); /* save code for string thus far */
-
- if(code_count) /* if more room in string table */
- { upd_tab(code,c); /* then add the new string */
- --code_count;
- }
-
- /* Now we start the loop again, starting with the character
- that didn't fit in the last string.
- */
-
- code = unhash(NO_PRED,c);
- }
-
- fini_cr(t) /* finish up after crunching */
- FILE *t; /* file to dump to */
- {
- putcode(t,code); /* always one unsent code left */
-
- flushout(t); /* make sure everything's written */
- }
-